{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import math\n",
    "import gensim.models as gs\n",
    "import pickle as pk\n",
    "import sklearn.metrics as met\n",
    "import scipy.stats as stats\n",
    "import numpy as np\n",
    "\n",
    "from sklearn import cross_validation\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "import twitter_sentiment_dataset as tsd\n",
    "import phrase2vec as p2v\n",
    "from twitter_sentiment_dataset import TweetTrainingExample\n",
    "from model import ModelParams"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup\n",
    "Load the three vector representations from files. In general, any variable with the word 'none' in it refers to Google News word2vec w/o any emoji vectors, 'ours' to Google News word2vec w/ vectors we trained, and 'theirs' to Google News word2vec with the vectors trained by Barbieri et. al."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "w2v_path='./data/word2vec/google_w2v_without_emoji.bin'\n",
    "\n",
    "in_dim = 300   # Length of word2vec vectors\n",
    "out_dim = 300  # Desired dimension of output vectors\n",
    "pos_ex = 4\n",
    "neg_ratio = 1\n",
    "max_epochs = 40\n",
    "dropout = 0.0\n",
    "\n",
    "params = ModelParams(in_dim=in_dim, out_dim=out_dim, pos_ex=pos_ex, max_epochs=max_epochs,\n",
    "                    neg_ratio=neg_ratio, learning_rate=0.001, dropout=dropout, class_threshold=0.5)\n",
    "\n",
    "\n",
    "\n",
    "e2v_ours_path = params.model_folder('unicode') + '/emoji2vec.bin'\n",
    "e2v_theirs_path = './data/word2vec/emoji_subset_theirs.bin'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "w2v = gs.Word2Vec.load_word2vec_format(w2v_path, binary=True)\n",
    "e2v_ours = gs.Word2Vec.load_word2vec_format(e2v_ours_path, binary=True)\n",
    "e2v_theirs = gs.Word2Vec.load_word2vec_format(e2v_theirs_path, binary=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "p2v_no_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=None)\n",
    "p2v_our_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=e2v_ours)\n",
    "p2v_their_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=e2v_theirs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using stats scraped from emojitracker.com at a certain point in time, we generate two sets of emoji: the top 173 most frequently used emoji, whose usage constitutes 90% of emoji usage on Twitter, and the bottom 612 least frequently used emoji, whose usage constitutes 10% of emoji usage on Twitter.\n",
    "\n",
    "Subsequently, 'common' will refer to the former group, while 'rare' will refer to the latter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "p = open('./data/tweets/frequencies_w_emoji.txt', 'r')\n",
    "ems = p.readlines()\n",
    "ems = [l.split('\\t')[0] for l in ems]\n",
    "p.close()\n",
    "top90 = set(ems[:173])\n",
    "bottom10 = set(ems[173:])\n",
    "p.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def emoji_dataset_stats(tweets):\n",
    "    total_tweets = len(tweets)\n",
    "    total_emoji = tsd.num_tweets_with_emoji(tweets, e2v_ours, e2v_theirs, ems)\n",
    "    top_90_total = tsd.num_tweets_with_emoji(tweets, set(), set(), top90)\n",
    "    bottom_10_total = tsd.num_tweets_with_emoji(tweets, set(), set(), bottom10)\n",
    "    return total_tweets, total_emoji, top_90_total, bottom_10_total"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Statistics for the entire Twitter corpus. Counts refer to # of tweets containing emoji of a type."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All Tweets in corpus: 64599, total emoji: 11700, common emoji: 11135, rare emoji: 1578\n",
      "Training set: total tweets: 51679, total emoji: 9405, common emoji: 8949, rare emoji: 1270\n",
      "Test set: total tweets: 12920, total emoji: 2295, common emoji: 2186, rare emoji: 308\n"
     ]
    }
   ],
   "source": [
    "train_tweets, test_tweets = tsd.load_training_test_sets()\n",
    "print('All Tweets in corpus: %s, total emoji: %s, common emoji: %s, rare emoji: %s' % emoji_dataset_stats(tsd.get_all_examples()))\n",
    "print('Training set: total tweets: %s, total emoji: %s, common emoji: %s, rare emoji: %s' % emoji_dataset_stats(train_tweets))\n",
    "print('Test set: total tweets: %s, total emoji: %s, common emoji: %s, rare emoji: %s' % emoji_dataset_stats(test_tweets))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def emoji_dataset_label_stats(tweets):\n",
    "    res = dict()\n",
    "    res['Positive'] = 0\n",
    "    res['Negative'] = 0\n",
    "    res['Neutral'] = 0\n",
    "    for tweet in tweets:\n",
    "        res[tweet.label] += 1/len(tweets)\n",
    "    print(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Neutral': 0.45970316763111974, 'Positive': 0.2882215213142463, 'Negative': 0.25207531105472025}\n",
      "{'Neutral': 0.4616873065016006, 'Positive': 0.2876160990712081, 'Negative': 0.25069659442723424}\n"
     ]
    }
   ],
   "source": [
    "emoji_dataset_label_stats(train_tweets)\n",
    "emoji_dataset_label_stats(test_tweets)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare Training and Testing Vectors\n",
    "Given the raw training and test tweets, calculate the vector representations for each tweet for each model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "train_none, train_y = tsd.prepare_tweet_vector_averages(train_tweets, p2v_no_emoji)\n",
    "train_ours, _ = tsd.prepare_tweet_vector_averages(train_tweets, p2v_our_emoji)\n",
    "train_theirs, _ = tsd.prepare_tweet_vector_averages(train_tweets, p2v_their_emoji)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "test_none, test_y = tsd.prepare_tweet_vector_averages(test_tweets, p2v_no_emoji)\n",
    "test_ours, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_our_emoji)\n",
    "test_theirs, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_their_emoji)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "classifiers = {\n",
    "    'SGD (n_iter=50)' : SGDClassifier(n_iter=50),\n",
    "    'Random Forest (n_estimators=60)' : RandomForestClassifier(n_estimators=60)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def train_all_with_cross_validation(train_none, train_ours, train_theirs, train_y, clf, clf_name, cv=5):\n",
    "    scores_none = cross_validation.cross_val_score(clf, train_none, train_y, cv=cv)\n",
    "    print(\"None: %s Train Accuracy: %0.2f (+/- %0.3f)\" % (clf_name, scores_none.mean(), scores_none.std() * 2))\n",
    "    \n",
    "    scores_ours = cross_validation.cross_val_score(clf, train_ours, train_y, cv=cv)\n",
    "    print(\"Ours: %s Train Accuracy: %0.2f (+/- %0.3f)\" % (clf_name, scores_ours.mean(), scores_ours.std() * 2))\n",
    "    \n",
    "    scores_theirs = cross_validation.cross_val_score(clf, train_theirs, train_y, cv=cv)\n",
    "    print(\"Theirs: %s Train Accuracy: %0.2f (+/- %0.3f)\" % (clf_name, scores_theirs.mean(), scores_theirs.std() * 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def train_and_predict(train_data, train_y, test_data, test_y, clf):\n",
    "    clf.fit(train_data, train_y)\n",
    "    predictions = clf.predict(test_data)\n",
    "    score = met.accuracy_score(test_y, predictions)\n",
    "    f1 = met.f1_score(test_y, predictions, average='weighted')\n",
    "    return predictions, score, f1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def train_and_predict_all(train_none, test_none, train_ours, test_ours, train_theirs, test_theirs, test_y, clf, clf_name):\n",
    "    none_pred, none_acc, none_f1 = train_and_predict(train_none, train_y, test_none, test_y, clf)\n",
    "    print('None: %s Test Accuracy: %0.5f, f1=%0.5f' % (clf_name, none_acc, none_f1))\n",
    "    \n",
    "    ours_pred, ours_acc, ours_f1 = train_and_predict(train_ours, train_y, test_ours, test_y, clf)\n",
    "    ours_p = tsd.calculate_mcnemars(none_pred, ours_pred, test_y)\n",
    "    print('Ours: %s Test Accuracy: %0.5f, p=%0.5f, f1=%0.5f' % (clf_name, ours_acc, ours_p, ours_f1))\n",
    "    \n",
    "    theirs_pred, theirs_acc, theirs_f1 = train_and_predict(train_theirs, train_y, test_theirs, test_y, clf)\n",
    "    theirs_p = tsd.calculate_mcnemars(none_pred, theirs_pred, test_y)\n",
    "    print('Theirs: %s Test Accuracy: %0.5f, p=%0.5f, f1=%0.5f' % (clf_name, theirs_acc, theirs_p, theirs_f1))\n",
    "    \n",
    "    ours_theirs_p = tsd.calculate_mcnemars(ours_pred, theirs_pred, test_y)\n",
    "    print('Significance between ours and theirs: p=%0.5f' % ours_theirs_p)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Performance on Training Set and Complete Test Set\n",
    "For each classifier, we calculate the average performance of the classifier on the training set when cross validation is applied, as well as the accuracy on the complete test set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Random Forest (n_estimators=60)\n",
      "\n",
      "Cross Validation Accuracy on Training Set\n",
      "\n",
      "None: Random Forest (n_estimators=60) Train Accuracy: 0.57 (+/- 0.009)\n",
      "Ours: Random Forest (n_estimators=60) Train Accuracy: 0.59 (+/- 0.010)\n",
      "Theirs: Random Forest (n_estimators=60) Train Accuracy: 0.58 (+/- 0.006)\n",
      "\n",
      "Accuracy on Test Set\n",
      "\n",
      "None: Random Forest (n_estimators=60) Test Accuracy: 0.57508, f1=0.55796\n",
      "Ours: Random Forest (n_estimators=60) Test Accuracy: 0.59543, p=0.00000, f1=0.58180\n",
      "Theirs: Random Forest (n_estimators=60) Test Accuracy: 0.58220, p=0.07141, f1=0.56749\n",
      "Significance between ours and theirs: p=0.00052\n",
      "\n",
      "SGD (n_iter=50)\n",
      "\n",
      "Cross Validation Accuracy on Training Set\n",
      "\n",
      "None: SGD (n_iter=50) Train Accuracy: 0.59 (+/- 0.013)\n",
      "Ours: SGD (n_iter=50) Train Accuracy: 0.60 (+/- 0.008)\n",
      "Theirs: SGD (n_iter=50) Train Accuracy: 0.60 (+/- 0.009)\n",
      "\n",
      "Accuracy on Test Set\n",
      "\n",
      "None: SGD (n_iter=50) Test Accuracy: 0.58545, f1=0.56046\n",
      "Ours: SGD (n_iter=50) Test Accuracy: 0.60534, p=0.00000, f1=0.58556\n",
      "Theirs: SGD (n_iter=50) Test Accuracy: 0.60008, p=0.00000, f1=0.57890\n",
      "Significance between ours and theirs: p=0.01279\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for clf_name, clf in classifiers.items():\n",
    "    print(clf_name)\n",
    "    \n",
    "    print()\n",
    "    \n",
    "    print('Cross Validation Accuracy on Training Set\\n')\n",
    "    train_all_with_cross_validation(train_none, train_ours, train_theirs, train_y, clf, clf_name, cv=5)\n",
    "    \n",
    "    print()\n",
    "    \n",
    "    print('Accuracy on Test Set\\n')\n",
    "    train_and_predict_all(train_none, test_none, train_ours, test_ours, train_theirs, test_theirs, test_y, clf, clf_name)\n",
    "    \n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def train_and_predict_all_on_test_subset(test_tweets, clf, clf_name):\n",
    "    test_none, test_y = tsd.prepare_tweet_vector_averages(test_tweets, p2v_no_emoji)\n",
    "    test_ours, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_our_emoji)\n",
    "    test_theirs, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_their_emoji)\n",
    "\n",
    "    train_and_predict_all(train_none, test_none, train_ours, test_ours, train_theirs, test_theirs, test_y, clf, clf_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "emoji_test_tweets = tsd.get_tweets_with_emoji(test_tweets, e2v_ours, e2v_theirs, ems)\n",
    "emoji_test_tweets_top90 = tsd.get_tweets_with_emoji(test_tweets, set(), set(), top90)\n",
    "emoji_test_tweets_bottom10 = tsd.get_tweets_with_emoji(test_tweets, set(), set(), bottom10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test Subset - All Tweets with Emoji\n",
    "For each classifier, we calculate the accuracy on the subset of test examples that contain emoji."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Random Forest (n_estimators=60)\n",
      "None: Random Forest (n_estimators=60) Test Accuracy: 0.46057, f1=0.46982\n",
      "Ours: Random Forest (n_estimators=60) Test Accuracy: 0.54423, p=0.00000, f1=0.55362\n",
      "Theirs: Random Forest (n_estimators=60) Test Accuracy: 0.52462, p=0.00000, f1=0.53852\n",
      "Significance between ours and theirs: p=0.06959\n",
      "\n",
      "SGD (n_iter=50)\n",
      "None: SGD (n_iter=50) Test Accuracy: 0.47146, f1=0.47933\n",
      "Ours: SGD (n_iter=50) Test Accuracy: 0.59216, p=0.00000, f1=0.60082\n",
      "Theirs: SGD (n_iter=50) Test Accuracy: 0.57386, p=0.00000, f1=0.58830\n",
      "Significance between ours and theirs: p=0.02857\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for clf_name, clf in classifiers.items():\n",
    "    print(clf_name)\n",
    "    train_and_predict_all_on_test_subset(emoji_test_tweets, clf, clf_name)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test Subset - All Tweets with Common Emoji\n",
    "For each classifier, we calculate the accuracy on the subset of test examples that contain common (Top 90%) emoji."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Random Forest (n_estimators=60)\n",
      "None: Random Forest (n_estimators=60) Test Accuracy: 0.47301, f1=0.48576\n",
      "Ours: Random Forest (n_estimators=60) Test Accuracy: 0.55078, p=0.00000, f1=0.56081\n",
      "Theirs: Random Forest (n_estimators=60) Test Accuracy: 0.52836, p=0.00000, f1=0.54292\n",
      "Significance between ours and theirs: p=0.03414\n",
      "\n",
      "SGD (n_iter=50)\n",
      "None: SGD (n_iter=50) Test Accuracy: 0.45151, f1=0.45669\n",
      "Ours: SGD (n_iter=50) Test Accuracy: 0.59515, p=0.00000, f1=0.60276\n",
      "Theirs: SGD (n_iter=50) Test Accuracy: 0.56908, p=0.00000, f1=0.58606\n",
      "Significance between ours and theirs: p=0.00385\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for clf_name, clf in classifiers.items():\n",
    "    print(clf_name)\n",
    "    train_and_predict_all_on_test_subset(emoji_test_tweets_top90, clf, clf_name)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test Subset - All Tweets with Rare Emoji\n",
    "For each classifier, we calculate the accuracy on the subset of test examples that contain rare (Bottom 10%) emoji."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Random Forest (n_estimators=60)\n",
      "None: Random Forest (n_estimators=60) Test Accuracy: 0.47403, f1=0.46927\n",
      "Ours: Random Forest (n_estimators=60) Test Accuracy: 0.54545, p=0.02626, f1=0.53617\n",
      "Theirs: Random Forest (n_estimators=60) Test Accuracy: 0.53896, p=0.03501, f1=0.54141\n",
      "Significance between ours and theirs: p=0.83657\n",
      "\n",
      "SGD (n_iter=50)\n",
      "None: SGD (n_iter=50) Test Accuracy: 0.43182, f1=0.41199\n",
      "Ours: SGD (n_iter=50) Test Accuracy: 0.55195, p=0.00001, f1=0.55512\n",
      "Theirs: SGD (n_iter=50) Test Accuracy: 0.52922, p=0.00041, f1=0.53032\n",
      "Significance between ours and theirs: p=0.31731\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for clf_name, clf in classifiers.items():\n",
    "    print(clf_name)\n",
    "    train_and_predict_all_on_test_subset(emoji_test_tweets_bottom10, clf, clf_name)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
